Загрузим данные и необходимые пакеты (№1)

# Read the insurance data set (path is relative to the working directory)
# and print a compact overview of its columns.
insurance <- read.csv(file = "insurance_cost.csv")
str(object = insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
library(dplyr)
library(ggplot2)
library(plotly)
library(ggbiplot)
library(ggpubr)
library(corrplot)
library(corrr)
library(caret)
library(factoextra)

График зависимости трат от ИМТ в plotly (№2)

# Interactive scatter plot of insurance charges against BMI, with points
# coloured by smoking status. The trace type and mode are stated explicitly
# so plotly does not have to guess them (otherwise it prints
# "No trace type specified" / "No scatter mode specified" messages).
plot_ly(
  data = insurance,
  x = ~bmi,
  y = ~charges,
  color = ~smoker,
  type = "scatter",
  mode = "markers"
)

А теперь в ggplotly (№3)

# The same BMI-vs-charges scatter plot, built with ggplot2 first and then
# converted into an interactive plotly widget.
plot <- ggplot(insurance, aes(x = bmi, y = charges, color = smoker)) +
  geom_point(size = 1.5) +
  theme_light()

ggplotly(p = plot)

Корреляционный анализ (№4)

# Keep only the numeric columns for the correlation analysis. The predicates
# are wrapped in where(), as tidyselect requires since 1.1.0 (bare predicate
# functions are deprecated and emit warnings). Integer columns are listed
# first so the column order (age, children, bmi, charges) stays unchanged.
insurance_for_cor <- insurance %>%
  select(where(is.integer) | where(is.numeric))

# Pairwise correlation matrix of the selected columns (cor() defaults).
insurance_cor <- cor(insurance_for_cor)
insurance_cor
##                age   children       bmi    charges
## age      1.0000000 0.04246900 0.1092719 0.29900819
## children 0.0424690 1.00000000 0.0127589 0.06799823
## bmi      0.1092719 0.01275890 1.0000000 0.19834097
## charges  0.2990082 0.06799823 0.1983410 1.00000000

Визуализируем матрицу

# Colour-coded upper triangle of the correlation matrix, with variables
# in alphabetical order.
corrplot(
  corr = insurance_cor,
  type = "upper",
  method = "color",
  order = "alphabet"
)

И другим способом

# Mixed layout: colour tiles below the diagonal and pie glyphs above it,
# with variables reordered using the "AOE" ordering.
corrplot.mixed(
  corr = insurance_cor,
  upper = "pie",
  lower = "color",
  order = "AOE"
)

И ещё одним

# Third view of the same correlation matrix, drawn with corrr's rplot().
rplot(insurance_cor)

Поработаем с датафреймом (№5)

# First, turn every categorical variable (sex, smoker, region) into a set of
# binary indicator columns, one column per level.
dummy <- dummyVars(" ~ sex + smoker + region", data = insurance)

dummy_insurance <- dummy %>%
  predict(newdata = insurance) %>%
  data.frame()

# Then put them next to the numeric variables taken from the original
# data frame.
other_insurance <- select(insurance, age, bmi, children, charges)

new_insurance <- bind_cols(other_insurance, dummy_insurance)

glimpse(new_insurance)
## Rows: 1,338
## Columns: 12
## $ age             <int> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56…
## $ bmi             <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440…
## $ children        <int> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ charges         <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 37…
## $ sexfemale       <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, …
## $ sexmale         <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ smokerno        <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, …
## $ smokeryes       <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ regionnortheast <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, …
## $ regionnorthwest <dbl> 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ regionsoutheast <dbl> 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ regionsouthwest <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, …

Иерархическая кластеризация

# Standardise every column (scale() centres and scales by default) so that
# all variables contribute on a comparable scale.
insurance_scaled <- new_insurance %>%
  scale()

# Pairwise Euclidean distances between observations; print the top-left
# 6 x 6 corner of the distance matrix as a quick check.
insurance_dist <- insurance_scaled %>%
  dist(method = "euclidean")
as.matrix(insurance_dist)[seq_len(6), seq_len(6)]
##          1        2        3        4        5        6
## 1 0.000000 5.825239 6.253322 5.747217 5.759522 4.978144
## 2 5.825239 0.000000 1.823634 4.289327 3.582563 3.361726
## 3 6.253322 1.823634 0.000000 4.663256 4.148789 3.956548
## 4 5.747217 4.289327 4.663256 0.000000 1.807952 4.583438
## 5 5.759522 3.582563 4.148789 1.807952 0.000000 4.329507
## 6 4.978144 3.361726 3.956548 4.583438 4.329507 0.000000
# Build the hierarchical clustering tree from the distance object using
# the "ward.D2" agglomeration method.
insurance_hc <- insurance_dist %>%
  hclust(method = "ward.D2")

# Draw the dendrogram; a very small label size is used because there are
# many leaves.
fviz_dend(x = insurance_hc, cex = 0.1)